This is a first pass at modeling student-level attendance. The primary use cases are: * to provide predictions of end-of-year average daily attendance (ADA) for each student earlier in the year (say the middle of the 1st quarter). * to provide predictions of end-of-year average daily attendance (ADA) for each school earlier in the year (say the middle of the 1st quarter). * to prepare for the addition of an On-Track metric in SQRP.
Another goal is to gain familiarity with the tidymodels suite of packages (e.g., recipes, parsnip, rsample).
The details on packages and data we pull are left to the appendix.
Now we calculate some quantities of interest:
# End-of-year ADA per student-year: total days enrolled, total days
# absent, and year_end = 1 - absent/enrolled.
yearly_ada <- student_att %>%
  filter(yearid < 28) %>%
  group_by(yearid, studentid) %>%
  summarize(
    enrolled = sum(enrolled),
    absent   = sum(absent),
    year_end = 1 - (absent / enrolled)
  )
# Running (cumulative) ADA for each student-year, in calendar-date order.
cum_ada <- student_att %>%
  filter(yearid < 28) %>%
  group_by(yearid, studentid) %>%
  arrange(date) %>%
  mutate(
    cum_enrolled = cumsum(enrolled),
    cum_absent   = cumsum(absent),
    running_ada  = 1 - (cum_absent / cum_enrolled)
  ) %>%
  # Drop leading rows before the student's first enrolled day (where the
  # running ADA denominator would be zero).
  filter(cum_enrolled > 0)
# Attach each student-year's EOY outcome (year_end) to every daily
# running-ADA row, so mid-year snapshots can be compared to the outcome.
yearly_cum_ada <- cum_ada %>%
  select(yearid, schoolid, grade_level, studentid, date,
         cum_enrolled, cum_absent, running_ada) %>%
  dplyr::left_join(
    yearly_ada %>% select(-c(enrolled, absent)),
    by = c("yearid", "studentid")
  )

# Spot-check a single student.
yearly_cum_ada %>% filter(studentid == 14852)
Now we begin doing some subsetting to build a basic model. We’ll pull the data for each student over the last few years (excluding SY 18-19) for the last school day in December. We’ll build our model off of this “snapshot”. The resulting table shows for each student their number of days enrolled (cum_enrolled), days absent (cum_absent), ADA through the last school day in December (running_ada), and the given year’s EOY ADA (year_end).
# Last school day in December for each school year (the snapshot date).
# Selecting yearid explicitly avoids dplyr's "Adding missing grouping
# variables" message: select() always retains grouping columns anyway.
dec_dates <- yearly_cum_ada %>%
  group_by(yearid) %>%
  select(yearid, date) %>%
  distinct() %>%
  filter(lubridate::month(date) == 12) %>%
  filter(date == max(date))
## Adding missing grouping variables: `yearid`
# Keep only the December-snapshot rows and cast the categorical model
# inputs to factors.
yca_filtered <- yearly_cum_ada %>%
  inner_join(
    dec_dates %>% ungroup() %>% select(-yearid),
    by = "date"
  ) %>%
  ungroup() %>%
  mutate(
    yearid   = as.factor(yearid),
    schoolid = as.factor(schoolid)
  )

yca_filtered
Our first model (mod_0) models year end ADA as a function of cumulative days enrolled and cumulative days absent through the last day of December, using school year as a fixed effect:
\[ y_{i} = f(\alpha + \beta_{e}d_{e} + \beta_{a}d_{a} +\delta_{year} ) \]
We’ll first fit a linear model:
# Baseline linear model: EOY ADA ~ December cumulative enrollment,
# cumulative absences, and a school-year fixed effect.
formula_0 <- as.formula(year_end ~ cum_enrolled + cum_absent + yearid)

mod_0 <- lm(formula = formula_0, data = yca_filtered)

summary(mod_0)
##
## Call:
## lm(formula = formula_0, data = yca_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.199491 -0.008197 0.004754 0.012723 0.142905
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.139e-01 4.391e-03 208.139 < 2e-16 ***
## cum_enrolled 8.508e-04 5.482e-05 15.521 < 2e-16 ***
## cum_absent -1.050e-02 8.611e-05 -121.913 < 2e-16 ***
## yearid26 1.188e-02 9.109e-04 13.044 < 2e-16 ***
## yearid27 6.599e-03 8.793e-04 7.505 7.22e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02485 on 4986 degrees of freedom
## Multiple R-squared: 0.756, Adjusted R-squared: 0.7558
## F-statistic: 3862 on 4 and 4986 DF, p-value: < 2.2e-16
Let’s plot the ranked residuals of mod_0:
# yca_filtered %>%
# ungroup() %>%
# modelr::add_predictions(mod_0) %>%
# modelr::add_residuals(mod_0) %>%
# arrange(resid) %>%
# group_by(yearid) %>%
# mutate(rank = row_number(resid)) %>%
# ggplot(aes(x = rank, y = resid)) +
# geom_segment(aes(xend = rank, yend = 0),
# size = .075, alph=.3) +
# facet_grid(yearid~.)
#' Lollipop plot of model residuals ranked within each school year.
#'
#' Adds predictions and residuals from `model` to `data`, ranks the
#' residuals within `yearid`, and draws one vertical segment per
#' observation, faceted by year.
#'
#' @param data a data frame containing the model's variables and `yearid`
#' @param model a fitted model accepted by modelr::add_predictions()
#' @return a ggplot object
plot_ranked_residuals <- function(data, model){
  data %>%
    # Consistency fix: the sibling helpers (plot_actual_v_residuals,
    # plot_actual_v_predicted) ungroup first; this is a no-op for
    # ungrouped input but makes the helper safe for grouped data.
    ungroup() %>%
    modelr::add_predictions(model) %>%
    modelr::add_residuals(model) %>%
    arrange(resid) %>%
    group_by(yearid) %>%
    mutate(rank = row_number(resid)) %>%
    ggplot(aes(x = rank, y = resid)) +
    geom_segment(aes(xend = rank, yend = 0),
                 size = .075, alpha = .3) +
    facet_grid(yearid~.)
}
plot_ranked_residuals(yca_filtered, mod_0)
And here are the residuals as a function of the actuals.
# yca_filtered %>%
# ungroup() %>%
# modelr::add_predictions(mod_0) %>%
# modelr::add_residuals(mod_0) %>%
# arrange(resid) %>%
# group_by(yearid) %>%
# mutate(rank = row_number(resid)) %>%
# ggplot(aes(x = year_end, y = resid)) +
# geom_point(aes(color = cum_enrolled),
# size = .001) +
# scale_color_kipp(discrete = FALSE) +
# facet_grid(yearid ~.)
#' Scatter residuals against actual EOY ADA, faceted by school year.
#'
#' @param data a data frame containing the model's variables and `yearid`
#' @param model a fitted model accepted by modelr::add_predictions()
#' @return a ggplot object
plot_actual_v_residuals <- function(data, model){
  augmented <- data %>%
    ungroup() %>%
    modelr::add_predictions(model) %>%
    modelr::add_residuals(model) %>%
    arrange(resid) %>%
    group_by(yearid) %>%
    mutate(rank = row_number(resid))

  augmented %>%
    ggplot(aes(x = year_end, y = resid)) +
    geom_point(aes(color = cum_enrolled), size = .001) +
    scale_color_kipp(discrete = FALSE) +
    facet_grid(yearid ~ .)
}
plot_actual_v_residuals(yca_filtered, mod_0)
#' Scatter predicted vs actual EOY ADA, faceted by school year.
#'
#' Dead code removed: the original also computed a within-year residual
#' rank (`group_by(yearid) %>% mutate(rank = ...)`) that this plot never
#' used. `arrange(resid)` is kept so point draw order is unchanged.
#'
#' @param data a data frame containing the model's variables and `yearid`
#' @param model a fitted model accepted by modelr::add_predictions()
#' @return a ggplot object
plot_actual_v_predicted <- function(data, model){
  data %>%
    ungroup() %>%
    modelr::add_predictions(model) %>%
    modelr::add_residuals(model) %>%
    arrange(resid) %>%
    ggplot(aes(x = year_end, y = pred)) +
    geom_point(aes(color = cum_enrolled),
               size = .001) +
    scale_color_kipp(discrete = FALSE) +
    facet_grid(yearid ~ .)
}
plot_actual_v_predicted(yca_filtered, mod_0)
Let’s fit these with a logit:
# Logit-link GLM treating year_end (a proportion in [0, 1]) as the
# response. The "non-integer #successes" warning is expected when a
# binomial family is fed proportions rather than counts.
mod_1 <- glm(formula_0,
             family = binomial(link = "logit"),
             data = yca_filtered)

summary(mod_1)
##
## Call:
## glm(formula = formula_0, family = binomial(link = "logit"), data = yca_filtered)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.74372 -0.05527 0.02874 0.11069 1.36702
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.109887 0.689484 3.060 0.00221 **
## cum_enrolled 0.017037 0.008641 1.972 0.04866 *
## cum_absent -0.129034 0.010785 -11.964 < 2e-16 ***
## yearid26 0.227414 0.170340 1.335 0.18186
## yearid27 0.229005 0.173188 1.322 0.18607
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 229.328 on 4990 degrees of freedom
## Residual deviance: 93.431 on 4986 degrees of freedom
## AIC: 502.16
##
## Number of Fisher Scoring iterations: 6
# Residual diagnostics for the logit model, reusing the helpers above.
plot_ranked_residuals(yca_filtered, mod_1)
plot_actual_v_residuals(yca_filtered, mod_1)
# NOTE(review): modelr::add_predictions() on a glm returns link-scale
# values by default — for mod_1 these are log-odds, not probabilities.
yca_filtered %>%
ungroup() %>%
modelr::add_predictions(mod_1) %>%
modelr::add_residuals(mod_1)
We need to partition in train and test to better evaluate our models.
We’ll take rsample for a spin!
# Hold out 25% of the snapshot rows via rsample.
yca_split <- rsample::initial_split(yca_filtered, prop = .75)
yca_train <- analysis(yca_split)    # training partition
yca_test <- assessment(yca_split)   # held-out partition

# Refit the baseline LM on the training rows only, then score the
# held-out rows and compute out-of-sample residuals.
mod_lm_train <- lm(formula = formula_0, data = yca_train)

yca_augmented <- broom::augment(mod_lm_train, newdata = yca_test) %>%
  mutate(resid = year_end - .fitted)
# Predicted vs actual on the test set; the 45-degree line marks perfect
# prediction. Point size shrinks with days enrolled.
yca_augmented %>%
ggplot(aes(x = year_end, y = .fitted)) +
geom_point(aes(color = cum_enrolled, size = -cum_enrolled)) +
geom_abline(aes(intercept = 0, slope = 1)) +
facet_grid(yearid~.) +
scale_color_kipp(discrete = FALSE)
# Same plot, but sized by the standard error of the fit and shaped by a
# 16-absence threshold.
yca_augmented %>%
ggplot(aes(x = year_end, y = .fitted)) +
geom_point(aes(color = cum_enrolled, size = .se.fit, shape = cum_absent>=16)) +
geom_abline(aes(intercept = 0, slope = 1)) +
facet_grid(yearid~.) +
scale_color_kipp(discrete = FALSE)
# NOTE(review): formula_0 and mod_0 are re-created here exactly as in an
# earlier chunk — redundant but harmless.
formula_0 <- as.formula(year_end ~ cum_enrolled + cum_absent + yearid)
mod_0 <- lm(formula = formula_0,
data = yca_filtered)
# Candidate model formulas, from simplest to most saturated.
formulas <- list(
as.formula(year_end ~ cum_enrolled + cum_absent),
as.formula(year_end ~ cum_enrolled + cum_absent + yearid),
as.formula(year_end ~ cum_enrolled + cum_absent + yearid + schoolid))
# Label each formula with its own deparsed text for later reporting.
list_names <- formulas %>%
map_chr(deparse)
names(formulas) <- list_names
# 10-fold cross-validation, repeated 3 times; seeded for reproducibility.
set.seed(1234)
rs_data <- vfold_cv(yca_filtered, v = 10, repeats = 3)
# Fit an lm on the analysis (training) portion of an rsample split and
# return the assessment (holdout) rows augmented with fitted values.
#
# `...` is forwarded to lm() and is expected to carry the model formula.
holdout_test <- function(splits, ...){
  # Explicit analysis() rather than `data = splits`: the original relied
  # on lm() silently coercing the rsplit via as.data.frame(), which
  # happens to return the analysis set (rsample's documented default).
  model <- lm(..., data = analysis(splits))
  holdout <- assessment(splits)
  broom::augment(model, newdata = holdout)
}
# Run holdout_test() over every resample split in `data` (an rset) and
# label the result with the deparsed model formula.
map_holdout <- function(data, formula){
  data$results <- map(data$splits,
                      holdout_test,
                      formula)
  # deparse() can return a multi-element character vector for long
  # formulas; collapsing guards against a multi-row mod_id. The original
  # as.character(deparse(...)) did not.
  data$mod_id <- paste(deparse(formula), collapse = " ")
  data
}
# Fit every candidate formula on every resample split.
run_models <- formulas %>%
  map(~{map_holdout(rs_data, .x) %>%
      as_tibble()})

# RMSE per resample. The original used map2_dbl() with the same list
# passed twice; iterating once with map_dbl() is equivalent and clearer.
calc_rmse <- run_models %>%
  map_df(~mutate(.x,
                 rmse_0 = map_dbl(results,
                                  ~rmse_vec(.x$year_end,
                                            .x$.fitted))))

# Average holdout RMSE per model formula.
summarize_rmse <- calc_rmse %>%
  group_by(mod_id) %>%
  summarize(avg_rmse = mean(rmse_0))

summarize_rmse
# Fit one model of the requested type on a single rsample split and
# return the assessment (holdout) rows augmented with fitted values,
# labeled with the deparsed formula (`model`) and the `model_type`.
#
# Args:
#   splits:     one rsample rsplit object
#   model_type: "linear", "logistic", or "knn" (case-insensitive)
#   formula:    model formula to fit
#   ...:        extra parsnip arguments (e.g. neighbors for knn)
#
# Returns the augmented holdout data frame (the value of the final
# assignment below).
holdout_model_spec <- function(splits, model_type, formula, ...){
if(tolower(model_type) == "linear"){
model_spec <- linear_reg() %>%
set_engine("lm")
model <- model_spec %>%
fit(formula,
data = as.data.frame(splits)) #parsnip wants a df; as.data.frame() on an rsplit yields the analysis set
}
if(tolower(model_type) == "logistic"){
model_spec <- logistic_reg() %>%
set_engine("glm")
model <- model_spec %>%
fit(formula,
data = as.data.frame(splits))
}
## Random forest disabled: requires an additional set of tuning
## variables (mtry, ntrees).
# if(tolower(model_type) == "random_forest"){
# model_spec <- rand_forest() %>%
# set_engine("rand_forest")
# model <- model_spec %>%
# fit(...,
# data = splits)
#
# }
if(tolower(model_type) == "knn") {
model_spec <- nearest_neighbor(mode = "regression", ...) %>%
set_engine("kknn")
model <- model_spec %>%
fit(formula, data = as.data.frame(splits))
}
# Score the held-out rows.
holdout <- assessment(splits)
if(model_type == "knn") {
predictions <- predict(model, new_data = holdout) # broom::augment() won't work with parsnip knn models
res <- bind_cols(holdout, predictions) %>%
rename(.fitted = .pred) %>%
mutate(.se.fit = as.numeric(NA)) # standard errors not available for KNN fits
# Fold the neighbor count into the model_type label, e.g. "knn 5".
args <- list(...)
model_type <- sprintf("%s %s", model_type, args$neighbors)
} else {
res <- broom::augment(model$fit, newdata = holdout) # augment the underlying fit, not the parsnip wrapper
}
# Convert log-odds to probability measure
if(tolower(model_type) == "logistic") {
res <- res %>% mutate(.fitted = exp(.fitted)/(1+exp(.fitted)))
}
res <- res %>% mutate(model = deparse(formula),
model_type = model_type)
# NOTE(review): this assigns into the local copy of `splits` only; its
# real effect is to return `res` (invisibly) as the function's value.
splits$results <- res
}
# Smoke tests of holdout_model_spec() on a single resample split.
holdout_model_spec(rs_data$splits[[1]], "linear", formula_0) #works!
holdout_model_spec(rs_data$splits[[1]], "logistic", formula_0) #%>% #.fitted not adjusted
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
#mutate(.fitted = exp(.fitted)/(1+exp(.fitted)))
x<-holdout_model_spec(rs_data$splits[[1]], "knn", formula_0, neighbors = 5)
x
Let’s do a bunch of parameterizations through Steph’s holdout_model_spec function. To do this we need a function that can take an rsample splits object and a list of parameterizations. We’ll do one model type (linear model, logistic, random forest, KNN) at a time.
The output should have the average RMSE for each model, as well as any other modeling output
Okay, let’s do three things: LM, logistic, and then KNN (with 3, 5, and 10 nearest neighbors):
# NOTE(review): holdout_model_splits() is not defined in this file (only
# holdout_model_spec() is) — presumably a mapper over rs_data defined
# elsewhere; confirm it exists.

# Collapse a list (one element per formula) of lists (one per resample)
# of augmented holdout frames into one RMSE row per resample. This was
# previously copy-pasted five times.
summarize_fits <- function(fits) {
  fits %>%
    map_df( . %>%
              map_df( . %>%
                        summarize(rmse = rmse_vec(truth = year_end, estimate = .fitted),
                                  model = unique(model),
                                  model_type = unique(model_type))))
}

# Linear models
x_lm <- formulas %>%
  map(~{holdout_model_splits(rs_data, "linear", .)})
x_lm_fit_summary <- summarize_fits(x_lm)

# Logistic models
x_logistic <- formulas %>%
  map(~{holdout_model_splits(rs_data, "logistic", .)})
x_logistic_fit_summary <- summarize_fits(x_logistic)

# KNN, 3 neighbors
x_knn_3 <- formulas %>%
  map(~{holdout_model_splits(rs_data, "knn", ., neighbors=3)})
x_knn_3_fit_summary <- summarize_fits(x_knn_3)

# KNN, 5 neighbors
x_knn_5 <- formulas %>%
  map(~{holdout_model_splits(rs_data, "knn", ., neighbors=5)})
x_knn_5_fit_summary <- summarize_fits(x_knn_5)

# KNN, 10 neighbors
x_knn_10 <- formulas %>%
  map(~{holdout_model_splits(rs_data, "knn", ., neighbors=10)})
x_knn_10_fit_summary <- summarize_fits(x_knn_10)

x_knn_fit_summary <- bind_rows(x_knn_3_fit_summary, x_knn_5_fit_summary, x_knn_10_fit_summary)
Now we bring all the x_*_fit_summary tables together and plot the results:
# Combine per-resample RMSE summaries across all model types.
x_fit_summary <- bind_rows(x_lm_fit_summary,
x_logistic_fit_summary,
x_knn_fit_summary)
x_fit_summary
# Mean holdout RMSE per formula, faceted by model type; fct_inorder()
# preserves the RMSE-sorted ordering on the axis.
x_fit_summary %>%
group_by(model, model_type) %>%
summarize(mean_rmse = mean(rmse)) %>%
arrange(desc(mean_rmse)) %>%
ungroup() %>%
mutate(model = fct_inorder(model),
model_type = fct_inorder(model_type)
) %>%
ggplot() +
geom_col(aes(x = model, y = mean_rmse, fill= model), show.legend = FALSE) +
facet_grid(model_type ~ .) +
coord_flip() +
scale_fill_kipp()
# Distribution of per-resample RMSE for each formula/model type:
# violin of the resample RMSEs, with a point at the mean and a line
# spanning mean +/- 1 SD.
x_fit_summary %>%
mutate(model2 = model) %>%
#ungroup() %>%
#arrange(rmse) %>%
#mutate()
ggplot() +
geom_violin(aes(y = rmse,
x = model2,
fill = model_type
), show.legend = FALSE) +
geom_point(data = x_fit_summary %>%
group_by(model, model_type) %>%
summarize(rmse = mean(rmse)),
aes(x = model, y = rmse),
show.legend = FALSE) +
geom_linerange(data = x_fit_summary %>%
group_by(model, model_type) %>%
summarize(y_min = mean(rmse) - 1*sd(rmse),
y_max = mean(rmse) + 1*sd(rmse)),
aes(x = model, ymin = y_min, ymax = y_max),
show.legend = FALSE) +
facet_grid(model_type ~ .) +
coord_flip() +
scale_fill_kipp()
# Density of per-resample RMSE, faceted by formula (rows) and model type
# (columns); a baseline segment marks mean +/- 1 SD and an open point
# marks the mean.
x_fit_summary %>%
ggplot(aes(x=rmse)) +
geom_density(aes(fill = model_type), show.legend = FALSE) +
geom_segment(data = x_fit_summary %>%
group_by(model, model_type) %>%
summarize(x_min = mean(rmse) - 1*sd(rmse),
x_max = mean(rmse) + 1*sd(rmse)),
aes(y = 0, yend=0, x = x_min, xend = x_max),
size = 1.25,
color = kipp_colors$darkgray,
show.legend = FALSE) +
geom_point(data = x_fit_summary %>%
group_by(model, model_type) %>%
summarize(rmse = mean(rmse)),
aes(x = rmse, y = 0),
show.legend = FALSE,
shape = 21,
fill = "white",
color = kipp_colors$darkgray) +
facet_grid(model ~ model_type, switch = "y") +
scale_fill_kipp() +
theme(strip.text.y = element_text(angle = 180),
axis.text.x = element_text(angle = 45))
library(tidyverse)
library(silounloadr)
library(kippcolors)
library(janitor)
library(lubridate)
library(caret)
library(broom)
library(modelr)
library(tidymodels)
#library(prophet)
theme_set(theme_kipp_light())
bigrquery::set_service_token("kipp-chicago-silo-2-aa786970aefd.json")
# Bug fix: `comment` and `warning` are chunk options and must be set via
# opts_chunk$set(); the original routed everything through opts_knit$set(),
# which ignores chunk options, so these settings had no effect.
# `progress` and `verbose` are knitr package options and stay on opts_knit.
knitr::opts_chunk$set(comment = FALSE,
                      warning = FALSE#,
                      #width = "90%"
                      )
knitr::opts_knit$set(progress = FALSE,
                     verbose = FALSE)
Here we grab basic attendances and enrollment data from Silo (our warehouse):
# Daily enrollment/membership per student. These are remote tables
# (collected later), so the dplyr verbs here are translated to SQL.
# Only SY 2015-16 onward (yearid >= 25) is pulled.
membership <- get_powerschool("ps_membership_reg") %>%
select(studentid,
schoolid,
date = calendardate,
enrolled = studentmembership,
grade_level,
attendance = ATT_CalcCntPresentAbsent,
yearid) %>%
filter(yearid >= 25)
# Daily (not period-level) attendance records for the same years.
attendance <- get_powerschool("attendance") %>%
filter(yearid >= 25,
att_mode_code == "ATT_ModeDaily")
# Attendance code lookup. Some sources store the tardy code "T" as the
# literal string "true"; normalize it back to "T".
attendance_code <- get_powerschool("attendance_code") %>%
mutate(att_code = if_else(att_code == "true", "T", att_code)) %>%
select(attendance_codeid = id,
att_code)
Now we join our daily membership and attendance tables and do some pre-processing.
# Join membership to attendance (with its decoded attendance codes) on
# student and date, then materialize locally with collect(). Rows with
# no attendance record end up with NA att_code.
member_att <- membership %>%
dplyr::left_join(attendance %>%
select(-schoolid,
-yearid) %>%
dplyr::left_join(attendance_code,
by = "attendance_codeid"),
by = c("studentid",
"date" = "att_date")) %>%
collect()
Recoding enrollment and attendance data:
# Frequency table of raw attendance codes before recoding.
member_att %>%
janitor::tabyl(att_code)
# Recode raw attendance codes into enrolled/present/absent/tardy/dna
# indicators. The present0..present3 chain applies rules in order;
# NA att_code (no attendance record) means present.
student_att <- member_att %>%
mutate(enrolled0 = 1,
# "D" (did-not-arrive) days do not count as enrolled; the
# !is.na() guard keeps NA codes at enrolled0.
enrolled = if_else(att_code == "D" & !is.na(att_code), 0, enrolled0),
present0 = ifelse(is.na(att_code), 1, 0),
# Absent ("A") and suspended ("S") are not present.
present1 = ifelse(att_code %in% c("A", "S"), 0, present0),
# Half-day ("H") counts as half present. NOTE: NA att_code makes
# this comparison NA, which propagates through present3 and is
# rescued by the is.na(present2) check below.
present2 = ifelse(att_code == "H", 0.5, present1),
# Tardy ("T"), early dismissal ("E"), in-school ("I") count as present.
present3 = ifelse(att_code %in% c("T", "E", "I"), 1, present2),
present = ifelse(is.na(present2), 1, present3),
# Absences only accrue on enrolled days.
absent = (1 - present)*enrolled,
tardy = ifelse(att_code %in% "T", 1, 0),
# dna0 is NA for NA att_code (if_else propagates NA); dna fixes that.
dna0 = if_else(att_code == "D", 1, 0),
dna = if_else(is.na(dna0), 0, dna0)) %>%
select(yearid,
schoolid,
studentid,
grade_level,
date,
att_code,
enrolled,
present,
absent,
tardy,
dna)
Getting student data:
# Student demographics/entry dates, materialized locally. The table's
# `id` is renamed to `studentid` to match the attendance tables' key.
students <- get_powerschool("students") %>%
select(studentid = id,
student_number,
gender,
entrydate,
schoolentrydate,
districtentrydate,
geocode) %>%
collect()